######################################################################################
# Project:  Complexity Project
#
# Task:     Regress FK and Syntactic complexity on pc and lr
#
# Author:   Martijn Schoonvelde
# Date:     November 2021
#
######################################################################################

#load libraries

rm(list = ls())

library(quanteda)
library(stringr)
library(tidyverse)


# Load data ---------------------------------------------------------------

# Helper: append `year2`, the corpus' `year` column shifted so that the
# smallest year present in that corpus becomes 0.
add_year2 <- function(corpus) {
  corpus$year2 <- corpus$year - min(corpus$year)
  corpus
}

# Each validation file is read from Data/ with read.csv() defaults and gets
# its `year2` column immediately after loading.
cong.corpus.sample <- add_year2(
  read.csv("Data/Corp_Congresso_Sample_Validation.csv")
)
bt.corpus.sample <- add_year2(
  read.csv("Data/Corp_Bundestag_Sample_Validation.csv")
)
tk.corpus.sample <- add_year2(
  read.csv("Data/Corp_TweedeKamer_Sample_Validation.csv")
)
hoc.corpus.sample <- add_year2(
  read.csv("Data/Corp_HouseofCommons_Sample_Validation.csv")
)
euspeech.corpus <- add_year2(
  read.csv("Data/EUSPeech_Validation.csv")
)
dutch.congress.corpus <- add_year2(
  read.csv("Data/Congress_Dutch_Validation.csv")
)

# Run regression analyses: Flesch-Kincaid ----------------------------------
# Outcome in every model: standardised `flesh.grade`.
# Predictors: standardised `lr` and `pc`. The four parliamentary samples
# additionally control for `year2` and `cabinet.party`; the Dutch congress
# corpus controls for `year2` only; EUSpeech has no controls.
fk_vars <- scale(flesh.grade) ~
  scale(lr) + scale(pc) + year2 + cabinet.party
fk_vars_euspeech <- scale(flesh.grade) ~ scale(lr) + scale(pc)
fk_vars_dutch_congress <- scale(flesh.grade) ~ scale(lr) + scale(pc) + year2

# The list order is significant: the plotting code below labels the models
# by position (Bundestag, Congresso, HoC, Tweede Kamer, EUSpeech, Dutch
# congress speeches).
reg.output <- list(
  lm(fk_vars, data = bt.corpus.sample),
  lm(fk_vars, data = cong.corpus.sample),
  lm(fk_vars, data = hoc.corpus.sample),
  lm(fk_vars, data = tk.corpus.sample),
  lm(fk_vars_euspeech, data = euspeech.corpus),
  # Fixed: this model was previously fitted on euspeech.corpus, so the
  # "Dutch Congress Speeches" estimate came from the wrong data set.
  lm(fk_vars_dutch_congress, data = dutch.congress.corpus)
)

# Figure: progressive-conservative coefficient, Flesch-Kincaid models ------

# Pull the estimate and standard error of the `scale(pc)` term from every
# model in reg.output. The term is selected by name rather than by position
# so a change in the formula cannot silently pick up another coefficient.
pc_term <- "scale(pc)"
pc_coefs <- vapply(
  reg.output,
  function(fit) coef(fit)[[pc_term]],
  numeric(1)
)
pc_se <- vapply(
  reg.output,
  function(fit) sqrt(diag(vcov(fit)))[[pc_term]],
  numeric(1)
)

# One label per model, in the same order as reg.output.
corpus_labels <- c(
  "DE-Bundestag", "ES-Congresso", "UK-HoC", "NL-Tweedekamer",
  "Prime Ministers", "Dutch Congress Speeches"
)

# Columns keep the names `names`, `coefs`, `se`, `min`, `max` (the combined
# plot relies on them), but no global objects called `min`, `max` or `names`
# are created, so the base functions are no longer shadowed.
# `min`/`max` are the bounds of the 95% normal-approximation interval.
plot.data.fk <- data.frame(
  names = factor(corpus_labels, levels = corpus_labels),
  coefs = pc_coefs,
  se = pc_se,
  min = pc_coefs - 1.96 * pc_se,
  max = pc_coefs + 1.96 * pc_se
)

# ymin/ymax are passed as fixed values (not mapped inside aes()), exactly as
# before. NOTE(review): mapping them via aes() would presumably make them
# subject to the y-scale limits below - confirm before changing.
fk.sample.plot <- ggplot(plot.data.fk, aes(x = names, y = coefs)) +
  geom_point(size = 0.75) +
  geom_errorbar(
    ymin = plot.data.fk$min,
    ymax = plot.data.fk$max,
    width = 0.15
  ) +
  coord_flip() +
  ylab("Effect of lib-cons ideology on Flesch Kincaid complexity") +
  scale_y_continuous(limits = c(-0.4, 0.2)) +
  geom_hline(yintercept = 0) +
  theme_minimal()


#ggsave(fk.sample.plot, file="fk_sample_coefficients.png", dpi=300)


# Run regression analyses: syntactic complexity scale ----------------------
# Outcome in every model: `syntactic_complexity_scale` (used as-is, without
# scale()). Predictors mirror the other regression sections: standardised
# `lr` and `pc`, plus `year2` / `cabinet.party` where the formula lists them.
syntactic_complexity_vars <- syntactic_complexity_scale ~
  scale(lr) + scale(pc) + year2 + cabinet.party
syntactic_complexity_vars_euspeech <- syntactic_complexity_scale ~
  scale(lr) + scale(pc)
syntactic_complexity_vars_dutch_congress <- syntactic_complexity_scale ~
  scale(lr) + scale(pc) + year2

# Models are stored in a fixed order; the plotting code labels them by
# position.
reg.output <- list(
  lm(syntactic_complexity_vars, data = bt.corpus.sample),
  lm(syntactic_complexity_vars, data = cong.corpus.sample),
  lm(syntactic_complexity_vars, data = hoc.corpus.sample),
  lm(syntactic_complexity_vars, data = tk.corpus.sample),
  lm(syntactic_complexity_vars_euspeech, data = euspeech.corpus),
  lm(syntactic_complexity_vars_dutch_congress, data = dutch.congress.corpus)
)

# Figure: progressive-conservative coefficient, syntactic complexity scale -

# Pull the estimate and standard error of the `scale(pc)` term from every
# model in reg.output. The term is selected by name rather than by position
# so a change in the formula cannot silently pick up another coefficient.
pc_term <- "scale(pc)"
pc_coefs <- vapply(
  reg.output,
  function(fit) coef(fit)[[pc_term]],
  numeric(1)
)
pc_se <- vapply(
  reg.output,
  function(fit) sqrt(diag(vcov(fit)))[[pc_term]],
  numeric(1)
)

# One label per model, in the same order as reg.output.
corpus_labels <- c(
  "DE-Bundestag", "ES-Congresso", "UK-HoC", "NL-Tweedekamer",
  "Prime Ministers", "Dutch Congress Speeches"
)

# Columns keep the names `names`, `coefs`, `se`, `min`, `max` (the combined
# plot relies on them), but no global objects called `min`, `max` or `names`
# are created, so the base functions are no longer shadowed.
# `min`/`max` are the bounds of the 95% normal-approximation interval.
plot.data.synt.comp <- data.frame(
  names = factor(corpus_labels, levels = corpus_labels),
  coefs = pc_coefs,
  se = pc_se,
  min = pc_coefs - 1.96 * pc_se,
  max = pc_coefs + 1.96 * pc_se
)

# ymin/ymax are passed as fixed values (not mapped inside aes()), exactly as
# before. NOTE(review): mapping them via aes() would presumably make them
# subject to the y-scale limits below - confirm before changing.
sc.sample.plot <- ggplot(plot.data.synt.comp, aes(x = names, y = coefs)) +
  geom_point(size = 0.75) +
  geom_errorbar(
    ymin = plot.data.synt.comp$min,
    ymax = plot.data.synt.comp$max,
    width = 0.15
  ) +
  coord_flip() +
  ylab("Effect of lib-cons ideology on syntactic complexity scale") +
  scale_y_continuous(limits = c(-0.2, 0.2)) +
  geom_hline(yintercept = 0) +
  theme_minimal()


#ggsave(sc.sample.plot, file="synactic_complexity_sample_coefficients.png", dpi=300)


# Run regression analyses: minimal syntactic complexity scale --------------
# Outcome in every model: `syntactic_complexity_scale_minimal` (used as-is,
# without scale()). Predictors mirror the other regression sections:
# standardised `lr` and `pc`, plus `year2` / `cabinet.party` where listed.
syntactic_complexity_vars_min <- syntactic_complexity_scale_minimal ~
  scale(lr) + scale(pc) + year2 + cabinet.party
syntactic_complexity_vars_min_euspeech <- syntactic_complexity_scale_minimal ~
  scale(lr) + scale(pc)
syntactic_complexity_vars_min_dutch_congress <-
  syntactic_complexity_scale_minimal ~ scale(lr) + scale(pc) + year2

# Models are stored in a fixed order; the plotting code labels them by
# position.
reg.output <- list(
  lm(syntactic_complexity_vars_min, data = bt.corpus.sample),
  lm(syntactic_complexity_vars_min, data = cong.corpus.sample),
  lm(syntactic_complexity_vars_min, data = hoc.corpus.sample),
  lm(syntactic_complexity_vars_min, data = tk.corpus.sample),
  lm(syntactic_complexity_vars_min_euspeech, data = euspeech.corpus),
  # Fixed: this model previously used syntactic_complexity_vars_dutch_congress
  # (the non-minimal outcome), leaving the `_min_` formula above unused and
  # mixing two different dependent variables in one figure.
  lm(
    syntactic_complexity_vars_min_dutch_congress,
    data = dutch.congress.corpus
  )
)

# Figure: progressive-conservative coefficient, minimal syntactic scale ----

# Pull the estimate and standard error of the `scale(pc)` term from every
# model in reg.output. The term is selected by name rather than by position
# so a change in the formula cannot silently pick up another coefficient.
pc_term <- "scale(pc)"
pc_coefs <- vapply(
  reg.output,
  function(fit) coef(fit)[[pc_term]],
  numeric(1)
)
pc_se <- vapply(
  reg.output,
  function(fit) sqrt(diag(vcov(fit)))[[pc_term]],
  numeric(1)
)

# One label per model, in the same order as reg.output.
corpus_labels <- c(
  "DE-Bundestag", "ES-Congresso", "UK-HoC", "NL-Tweedekamer",
  "Prime Ministers", "Dutch Congress Speeches"
)

# Columns keep the names `names`, `coefs`, `se`, `min`, `max` (the combined
# plot relies on them), but no global objects called `min`, `max` or `names`
# are created, so the base functions are no longer shadowed.
# `min`/`max` are the bounds of the 95% normal-approximation interval.
plot.data.synt.comp.min <- data.frame(
  names = factor(corpus_labels, levels = corpus_labels),
  coefs = pc_coefs,
  se = pc_se,
  min = pc_coefs - 1.96 * pc_se,
  max = pc_coefs + 1.96 * pc_se
)

# NOTE(review): this assigns to `sc.sample.plot`, the same name used for the
# plot of the non-minimal scale earlier in the script, so that earlier plot
# object is replaced here - confirm this is intended.
# ymin/ymax are passed as fixed values (not mapped inside aes()), exactly as
# before. NOTE(review): mapping them via aes() would presumably make them
# subject to the y-scale limits below - confirm before changing.
sc.sample.plot <- ggplot(plot.data.synt.comp.min, aes(x = names, y = coefs)) +
  geom_point(size = 0.75) +
  geom_errorbar(
    ymin = plot.data.synt.comp.min$min,
    ymax = plot.data.synt.comp.min$max,
    width = 0.15
  ) +
  coord_flip() +
  ylab("Effect of lib-cons ideology on syntactic complexity scale") +
  scale_y_continuous(limits = c(-0.4, 0.4)) +
  geom_hline(yintercept = 0) +
  theme_minimal()


#ggsave(sc.sample.plot, file="semantic_complexity_sample_coefficients.png", dpi=300)


# Plot all three dependent variables together ------------------------------

# Tag each coefficient table with the dependent variable it belongs to, then
# stack the three tables into one long data frame.
plot.data.fk$dv <- "Flesch-Kincaid"
plot.data.synt.comp$dv <- "4 indicators of syntactic complexity"
plot.data.synt.comp.min$dv <- "2 indicators of syntactic complexity"

plot.data <- rbind(plot.data.fk, plot.data.synt.comp, plot.data.synt.comp.min)

# Rename the tag column by name instead of by position, so the rename still
# hits the right column if the tables ever gain or lose a column.
names(plot.data)[names(plot.data) == "dv"] <- "dependent_variable"

# One point-range per corpus and dependent variable, dodged so the three
# estimates for a corpus do not overlap; point shape encodes the dependent
# variable.
total.sample.plot <- ggplot(
  plot.data,
  aes(
    x = names,
    y = coefs,
    shape = dependent_variable,
    ymin = min,
    ymax = max
  )
) +
  geom_pointrange(position = position_dodge(width = 0.5)) +
  coord_flip() +
  ylab("Effect of lib-cons ideology on measures of syntactic complexity") +
  xlab("Corpora") +
  # scale_y_continuous(limits = c(-0.45, 0.25)) +
  geom_hline(yintercept = 0) +
  # Fixed: theme_minimal() is a complete theme and replaces any theme
  # settings added before it, so the legend-title tweak must come after it
  # to take effect.
  theme_minimal() +
  theme(legend.title = element_blank())

# Arguments are spelled out in full; the previous `file =` only worked
# through partial matching of `filename`.
ggsave(
  filename = "total_complexity_sample_coefficients.png",
  plot = total.sample.plot,
  dpi = 600,
  width = 7,
  height = 7
)


